Loading Library
library(RColorBrewer)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.0
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
library(readr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(tidyverse)
library(stringr)
library(dplyr)
library(formattable)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:formattable':
##
## style
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(readr)
library(htmltools)
library(htmlwidgets)
bin062 <- read_csv("bin062_topfiftyhits.txt",
col_names = c("qseqid", "sseqid", "evalue", "staxids", "sscinames", "sskingdoms", "stitle"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 7028 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): qseqid, sseqid, staxids, sscinames, sskingdoms, stitle
## dbl (1): evalue
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bin062_eval <- bin062 %>%
filter(as.numeric(evalue) <= 1e-10)
bin062_kingdomcount <- bin062_eval %>%
count(sskingdoms)
bin062_kingdomcount
## # A tibble: 4 × 2
## sskingdoms n
## <chr> <int>
## 1 Archaea 62
## 2 Bacteria 776
## 3 Eukaryota 942
## 4 Viruses 69
piechart_bin062 <- ggplot(bin062_kingdomcount, aes(x="", y=n, fill=sskingdoms)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
scale_fill_brewer(palette = "PiYG") +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
piechart_bin062

bin062_rmduplicates <- bin062_eval %>%
group_by(qseqid) %>%
distinct(sseqid, .keep_all = TRUE)
bin062_eukaryotes <- bin062_rmduplicates %>%
filter(sskingdoms == "Eukaryota") %>%
group_by(qseqid, sseqid, staxids) %>%
slice_min(order_by = evalue) %>%
ungroup() %>%
mutate(genus = word(sscinames, 1))
bin062_eukaryotes
## # A tibble: 552 × 8
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Contig24240559_114 emb|CAH9114… 7.14e-18 186058 Cuscut… Eukary… unnam… Cusc…
## 2 Contig24240559_114 emb|CAI9287… 2.77e-18 75948 Lactuc… Eukary… unnam… Lact…
## 3 Contig24240559_114 emb|VAI3877… 9.77e-18 4567 Tritic… Eukary… unnam… Trit…
## 4 Contig24240559_114 emb|VAI3877… 1.03e-17 4567 Tritic… Eukary… unnam… Trit…
## 5 Contig24240559_114 emb|VAI3878… 2.5 e-18 4567 Tritic… Eukary… unnam… Trit…
## 6 Contig24240559_114 emb|VAI3878… 9.37e-18 4567 Tritic… Eukary… unnam… Trit…
## 7 Contig24240559_114 gb|KAF33316… 5.24e-19 544730 Carex … Eukary… histo… Carex
## 8 Contig24240559_114 gb|KAF36610… 1.08e-17 4072 Capsic… Eukary… Histo… Caps…
## 9 Contig24240559_114 gb|KAF81024… 1.11e-17 3728 Sinapi… Eukary… hypot… Sina…
## 10 Contig24240559_114 gb|KAF86837… 2.02e-18 1010633 Digita… Eukary… hypot… Digi…
## # … with 542 more rows, and abbreviated variable names ¹​sscinames, ²​sskingdoms
# Count the number of times each genus appears for a specific protein sequence
bin062_egenus_counts <- bin062_eukaryotes %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 1) # Only include those with more than one count
print(bin062_egenus_counts)
## # A tibble: 84 × 3
## # Groups: qseqid, genus [84]
## qseqid genus count
## <chr> <chr> <int>
## 1 Contig24240559_114 Ambrosia 4
## 2 Contig24240559_114 Camelina 3
## 3 Contig24240559_114 Helianthus 7
## 4 Contig24240559_114 Hordeum 2
## 5 Contig24240559_114 Lolium 7
## 6 Contig24240559_114 Panicum 4
## 7 Contig24240559_114 Triticum 9
## 8 Contig24240559_119 Closterium 2
## 9 Contig24240559_119 Parelaphostrongylus 2
## 10 Contig24240559_13 Cyberlindnera 2
## # … with 74 more rows
# Count the number of times each genus appears for a specific protein sequence
bin062_egenus_counts_3 <- bin062_eukaryotes %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 5) # Only include those with more than one count
print(bin062_egenus_counts_3)
## # A tibble: 14 × 3
## # Groups: qseqid, genus [14]
## qseqid genus count
## <chr> <chr> <int>
## 1 Contig24240559_114 Helianthus 7
## 2 Contig24240559_114 Lolium 7
## 3 Contig24240559_114 Triticum 9
## 4 Contig24240559_138 Brassica 15
## 5 Contig24240559_149 Trichomonas 12
## 6 Contig24240559_158 Astyanax 9
## 7 Contig24240559_158 Daphnia 6
## 8 Contig24240559_158 Helicoverpa 19
## 9 Contig24240559_158 Oreochromis 7
## 10 Contig24240559_57 Lentinula 6
## 11 Contig24240559_89 Naegleria 6
## 12 Contig61333578_11 Rhizophagus 7
## 13 Contig61333578_17 Arachis 9
## 14 Contig61333578_54 Seminavis 27
distinct_colors <- c("skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue", "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin")
bin062_eukaryotes_ggplot <- ggplot(bin062_egenus_counts_3, aes(x = genus, y = count)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
theme_minimal() +
theme(text = element_text(size = 25), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin062_eukaryotes_ggplot

bin062_eukaryotes_ggplot <- ggplot(bin062_egenus_counts, aes(x = qseqid, y = count, fill = genus)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
scale_fill_manual(values = distinct_colors) +
theme_minimal() +
theme(text = element_text(size = 15), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin062_eukaryotes_ggplot

ggplotly(bin062_eukaryotes_ggplot)